In [ ]:
Name : Prajakta Ramesh Chavan.
Date:23/02/2024
Domain: Data Science -oasis Infobyte 
Task no:02: Unemployment analysis with Python
In [1]:
#importing the necessary libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import calendar

import datetime as dt

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import HTML
In [2]:
df = pd.read_csv(r'C:\Users\Dell\Desktop\dataset\Unemployment_Rate.csv')
df
Out[2]:
Region Date Frequency Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Region.1 longitude latitude
0 Andhra Pradesh 31-01-2020 M 5.48 16635535 41.02 South 15.9129 79.740
1 Andhra Pradesh 29-02-2020 M 5.83 16545652 40.90 South 15.9129 79.740
2 Andhra Pradesh 31-03-2020 M 5.79 15881197 39.18 South 15.9129 79.740
3 Andhra Pradesh 30-04-2020 M 20.51 11336911 33.10 South 15.9129 79.740
4 Andhra Pradesh 31-05-2020 M 17.43 12988845 36.46 South 15.9129 79.740
... ... ... ... ... ... ... ... ... ...
262 West Bengal 30-06-2020 M 7.29 30726310 40.39 East 22.9868 87.855
263 West Bengal 31-07-2020 M 6.83 35372506 46.17 East 22.9868 87.855
264 West Bengal 31-08-2020 M 14.87 33298644 47.48 East 22.9868 87.855
265 West Bengal 30-09-2020 M 9.35 35707239 47.73 East 22.9868 87.855
266 West Bengal 31-10-2020 M 9.98 33962549 45.63 East 22.9868 87.855

267 rows × 9 columns

In [39]:
india = pd.read_csv(r'C:\Users\Dell\Desktop\dataset\Unemployment_in_India.csv')
india
Out[39]:
Region Date Frequency Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Area
0 Andhra Pradesh 31-05-2019 Monthly 3.65 11999139.0 43.24 Rural
1 Andhra Pradesh 30-06-2019 Monthly 3.05 11755881.0 42.05 Rural
2 Andhra Pradesh 31-07-2019 Monthly 3.75 12086707.0 43.50 Rural
3 Andhra Pradesh 31-08-2019 Monthly 3.32 12285693.0 43.97 Rural
4 Andhra Pradesh 30-09-2019 Monthly 5.17 12256762.0 44.68 Rural
... ... ... ... ... ... ... ...
763 NaN NaN NaN NaN NaN NaN NaN
764 NaN NaN NaN NaN NaN NaN NaN
765 NaN NaN NaN NaN NaN NaN NaN
766 NaN NaN NaN NaN NaN NaN NaN
767 NaN NaN NaN NaN NaN NaN NaN

768 rows × 7 columns

In [ ]:
india.head(5)
In [3]:
df.head(5)
Out[3]:
Region Date Frequency Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Region.1 longitude latitude
0 Andhra Pradesh 31-01-2020 M 5.48 16635535 41.02 South 15.9129 79.74
1 Andhra Pradesh 29-02-2020 M 5.83 16545652 40.90 South 15.9129 79.74
2 Andhra Pradesh 31-03-2020 M 5.79 15881197 39.18 South 15.9129 79.74
3 Andhra Pradesh 30-04-2020 M 20.51 11336911 33.10 South 15.9129 79.74
4 Andhra Pradesh 31-05-2020 M 17.43 12988845 36.46 South 15.9129 79.74
In [4]:
df.shape
Out[4]:
(267, 9)
In [40]:
india.shape
Out[40]:
(768, 7)
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267 entries, 0 to 266
Data columns (total 9 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Region                                    267 non-null    object 
 1    Date                                     267 non-null    object 
 2    Frequency                                267 non-null    object 
 3    Estimated Unemployment Rate (%)          267 non-null    float64
 4    Estimated Employed                       267 non-null    int64  
 5    Estimated Labour Participation Rate (%)  267 non-null    float64
 6   Region.1                                  267 non-null    object 
 7   longitude                                 267 non-null    float64
 8   latitude                                  267 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 18.9+ KB
In [41]:
india.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Region                                    740 non-null    object 
 1    Date                                     740 non-null    object 
 2    Frequency                                740 non-null    object 
 3    Estimated Unemployment Rate (%)          740 non-null    float64
 4    Estimated Employed                       740 non-null    float64
 5    Estimated Labour Participation Rate (%)  740 non-null    float64
 6   Area                                      740 non-null    object 
dtypes: float64(3), object(4)
memory usage: 42.1+ KB
In [6]:
df.isnull().sum()
Out[6]:
Region                                      0
 Date                                       0
 Frequency                                  0
 Estimated Unemployment Rate (%)            0
 Estimated Employed                         0
 Estimated Labour Participation Rate (%)    0
Region.1                                    0
longitude                                   0
latitude                                    0
dtype: int64
In [7]:
df.describe()
Out[7]:
Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) longitude latitude
count 267.000000 2.670000e+02 267.000000 267.000000 267.000000
mean 12.236929 1.396211e+07 41.681573 22.826048 80.532425
std 10.803283 1.336632e+07 7.845419 6.270731 5.831738
min 0.500000 1.175420e+05 16.770000 10.850500 71.192400
25% 4.845000 2.838930e+06 37.265000 18.112400 76.085600
50% 9.650000 9.732417e+06 40.390000 23.610200 79.019300
75% 16.755000 2.187869e+07 44.055000 27.278400 85.279900
max 75.850000 5.943376e+07 69.690000 33.778200 92.937600
In [42]:
india.describe()
Out[42]:
Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%)
count 740.000000 7.400000e+02 740.000000
mean 11.787946 7.204460e+06 42.630122
std 10.721298 8.087988e+06 8.111094
min 0.000000 4.942000e+04 13.330000
25% 4.657500 1.190404e+06 38.062500
50% 8.350000 4.744178e+06 41.160000
75% 15.887500 1.127549e+07 45.505000
max 76.740000 4.577751e+07 72.570000
In [8]:
df.duplicated().sum()
Out[8]:
0
In [43]:
india.duplicated().sum()
Out[43]:
27
In [9]:
df.columns =['States','Date','Frequency','Estimated Unemployment Rate','Estimated Employed','Estimated Labour Participation Rate','Region','longitude','latitude']
In [10]:
df.columns
Out[10]:
Index(['States', 'Date', 'Frequency', 'Estimated Unemployment Rate',
       'Estimated Employed', 'Estimated Labour Participation Rate', 'Region',
       'longitude', 'latitude'],
      dtype='object')
In [44]:
india.columns =['States','Date','Frequency','Estimated Unemployment Rate','Estimated Employed','Estimated Labour Participation Rate','Area']
In [45]:
india.columns
Out[45]:
Index(['States', 'Date', 'Frequency', 'Estimated Unemployment Rate',
       'Estimated Employed', 'Estimated Labour Participation Rate', 'Area'],
      dtype='object')
In [11]:
df.head(2)
Out[11]:
States Date Frequency Estimated Unemployment Rate Estimated Employed Estimated Labour Participation Rate Region longitude latitude
0 Andhra Pradesh 31-01-2020 M 5.48 16635535 41.02 South 15.9129 79.74
1 Andhra Pradesh 29-02-2020 M 5.83 16545652 40.90 South 15.9129 79.74
In [46]:
india.head(2)
Out[46]:
States Date Frequency Estimated Unemployment Rate Estimated Employed Estimated Labour Participation Rate Area
0 Andhra Pradesh 31-05-2019 Monthly 3.65 11999139.0 43.24 Rural
1 Andhra Pradesh 30-06-2019 Monthly 3.05 11755881.0 42.05 Rural
In [12]:
df['Date'] = pd.to_datetime(df['Date'],dayfirst=True)
In [13]:
df['Frequency']= df['Frequency'].astype('category')
In [14]:
df['Month'] =  df['Date'].dt.month
In [15]:
df['MonthNumber'] = df['Month'].apply(lambda x : int(x))
In [16]:
df['MonthName'] =  df['MonthNumber'].apply(lambda x: calendar.month_abbr[x])
In [17]:
df['Region'] = df['Region'].astype('category')
In [18]:
df.drop(columns='Month',inplace=True)
In [19]:
df.describe()
Out[19]:
Date Estimated Unemployment Rate Estimated Employed Estimated Labour Participation Rate longitude latitude MonthNumber
count 267 267.000000 2.670000e+02 267.000000 267.000000 267.000000 267.000000
mean 2020-06-16 09:15:30.337078528 12.236929 1.396211e+07 41.681573 22.826048 80.532425 5.535581
min 2020-01-31 00:00:00 0.500000 1.175420e+05 16.770000 10.850500 71.192400 1.000000
25% 2020-03-31 00:00:00 4.845000 2.838930e+06 37.265000 18.112400 76.085600 3.000000
50% 2020-06-30 00:00:00 9.650000 9.732417e+06 40.390000 23.610200 79.019300 6.000000
75% 2020-08-31 00:00:00 16.755000 2.187869e+07 44.055000 27.278400 85.279900 8.000000
max 2020-10-31 00:00:00 75.850000 5.943376e+07 69.690000 33.778200 92.937600 10.000000
std NaN 10.803283 1.336632e+07 7.845419 6.270731 5.831738 2.870915
In [20]:
round(df[['Estimated Unemployment Rate', 'Estimated Employed', 'Estimated Labour Participation Rate']].describe().T,2)
Out[20]:
count mean std min 25% 50% 75% max
Estimated Unemployment Rate 267.0 12.24 10.80 0.50 4.84 9.65 16.76 75.85
Estimated Employed 267.0 13962105.72 13366318.36 117542.00 2838930.50 9732417.00 21878686.00 59433759.00
Estimated Labour Participation Rate 267.0 41.68 7.85 16.77 37.26 40.39 44.06 69.69
In [21]:
regionStats = df.groupby(['Region'])[['Estimated Unemployment Rate',
                                      'Estimated Employed',
                                      'Estimated Labour Participation Rate']].mean().reset_index()

round(regionStats,2)
Out[21]:
Region Estimated Unemployment Rate Estimated Employed Estimated Labour Participation Rate
0 East 13.92 19602366.90 40.11
1 North 15.89 13072487.92 38.70
2 Northeast 10.95 3617105.53 52.06
3 South 10.45 14040589.33 40.44
4 West 8.24 18623512.72 41.26
In [22]:
heatMap = df[['Estimated Unemployment Rate', 'Estimated Employed', 
              'Estimated Labour Participation Rate', 'longitude', 'latitude', 'MonthNumber']]

heatMap = heatMap.corr()

plt.figure(figsize=(23,8))
sns.heatmap(heatMap, annot=True,cmap='magma', fmt='.3f', linewidths=1)
plt.title('heatMap')
plt.show()
In [37]:
fig = px.box(
    df,
    x='States',
    y='Estimated Unemployment Rate',
    color='States',
    title='unemploymentRate',
    template='plotly'
)
fig.show()
In [23]:
#plotting a "Bar-plot" to find the "average unemployment rate in each state"
newDF = df[['Estimated Unemployment Rate','States']]

#grouping the dataframe by 'States' and finding the corresponding 'mean'
newDF = newDF.groupby('States').mean().reset_index()

#sorting the values in the dataframe
newDF = newDF.sort_values('Estimated Unemployment Rate')

fig = px.bar(newDF, 
             x='States',
             y='Estimated Unemployment Rate',
             color='States',
             title='State-wise Average Employment Rate')
fig.show()
In [24]:
fig = px.bar(df, 
             x='Region',
             y='Estimated Unemployment Rate',
             animation_frame = 'MonthName',
             color='States',
             title='Region-wise Unemployment Rate',
             height=800)

fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1500

fig.show()
In [25]:
unempDF = df[['States','Region','Estimated Unemployment Rate','Estimated Employed','Estimated Labour Participation Rate']]

unempDF = unempDF.groupby(['Region','States'])['Estimated Unemployment Rate'].mean().reset_index()

#printing the new dataframe
unempDF.head(4)
Out[25]:
Region States Estimated Unemployment Rate
0 East Andhra Pradesh NaN
1 East Assam NaN
2 East Bihar 19.471
3 East Chhattisgarh NaN
In [26]:
fig = px.sunburst(unempDF, 
                  path=['Region','States'], 
                  values='Estimated Unemployment Rate',
                  title= 'unemployment rate in each region and state',
                  height=650)
fig.show()
In [27]:
#!pip install  sunburst
Requirement already satisfied: sunburst in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (1.0.0a2)
Requirement already satisfied: matplotlib in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from sunburst) (3.7.2)
Requirement already satisfied: typing in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from sunburst) (3.7.4.3)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (1.0.5)
Requirement already satisfied: cycler>=0.10 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (1.4.4)
Requirement already satisfied: numpy>=1.20 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (1.24.3)
Requirement already satisfied: packaging>=20.0 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (9.4.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from python-dateutil>=2.7->matplotlib->sunburst) (1.16.0)
In [28]:
fig = px.scatter_geo(df,'longitude', 'latitude', 
                     color="Region",
                     hover_name="States", 
                     size="Estimated Unemployment Rate",
                     animation_frame="MonthName",
                     scope='asia',
                     title='Lockdown Impact throughout India')

fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1200

#updating the geospatial axes ranges and ocean color
fig.update_geos(lataxis_range=[5,35], 
                lonaxis_range=[65, 100],
                oceancolor="#6dd5ed",
                showocean=True)

fig.show()
In [29]:
df47 = df[(df['MonthNumber'] >= 4) & (df['MonthNumber'] <=7)]

#filtering dataset between month 1 and 4 (inclusive) - before lockdown
df14 = df[(df['MonthNumber'] >= 1) & (df['MonthNumber'] <=4)]
In [30]:
df47g = df47.groupby('States')['Estimated Unemployment Rate'].mean().reset_index()

#grouping the dataframe on the basis of "States" and finding the corresponding mean values
df14g = df14.groupby('States')['Estimated Unemployment Rate'].mean().reset_index()

#clubbing the 2 dataframe values
df47g['Unemployment Rate before lockdown'] = df14g['Estimated Unemployment Rate']

#renaming the column values for better understanding
df47g.columns = ['States','unemploymentRate A/ lockdown','unemploymentRate B/ lockdown']

#displaying the top results
df47g.head()
Out[30]:
States unemploymentRate A/ lockdown unemploymentRate B/ lockdown
0 Andhra Pradesh 12.3975 9.4025
1 Assam 6.2450 6.2250
2 Bihar 30.8025 20.7425
3 Chhattisgarh 9.6025 7.2450
4 Delhi 24.3600 17.6975
In [31]:
df47g['% change in unemployment'] = round(df47g['unemploymentRate A/ lockdown'] - df47g['unemploymentRate B/ lockdown']/df47g['unemploymentRate B/ lockdown'],2)
In [32]:
df47g = df47g.sort_values('% change in unemployment')
In [33]:
fig = px.bar(df47g, x='States',y='% change in unemployment',
             color='% change in unemployment',
             title='% change in Unemployment A/ Lockdown')
In [34]:
def sort_impact(x):
    if x <= 10:
        #impactedState
        return ''
    
    elif x <= 20:
        #hardImpactedState
        return ''
    
    elif x <= 30:
        #harderImpactedState
        return ''
    
    elif x <= 40:
        #hardestImpactedState
        return ''
    
    return x    
In [35]:
df47g['impactStatus'] = df47g['% change in unemployment'].apply(lambda x:sort_impact(x))
In [36]:
fig = px.bar(df47g, 
             y='States',
             x='% change in unemployment',
             color='impactStatus',
             title='Lockdown Impact on Employment in India')

fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: